### Analysis script
### Open this file with UTF-8 encoding (the phrase list below contains German umlauts)

library(quanteda)
library(quanteda.textmodels)
library(quanteda.textstats)
library(quanteda.textplots)
library(SnowballC)
library(stm)
library(tidyverse)
library(haven)

### Set max print
# Raise the console print limit so large objects are not truncated when printed.
options(max.print = 1000000)

### load in data
# Directory that holds the two .RData inputs. The previous `setwd()` call had
# no argument, which always raises an error and stopped the script here.
# Point `data_dir` at the data folder instead of changing the working directory.
data_dir <- "."
# Expected to provide `Final_AFD_Data` (used to build the corpus below).
load(file.path(data_dir, "Final_AFD_Data.RData"))
# Expected to provide `Wiki_Total` (joined with the topic proportions later).
load(file.path(data_dir, "Wiki_Total.Rdata"))

### create corpus
# Build the quanteda corpus from the loaded data frame: the text is read from
# the "content" column and the document ids from the "ID" column.
corpus_AFD <- corpus(
  Final_AFD_Data,
  text_field = "content",
  docid_field = "ID"
)
corpus_AFD

### create tokens object
# Stopword list: the German "marimo" list plus three extra filler words.
afd_stopwords <- c(stopwords("de", source = "marimo"), "immer", "schon", "dass")

# Tokenise without punctuation, numbers, symbols and URLs, drop the stopwords,
# then keep only tokens that consist entirely of Latin-script letters.
tokens_AFD <- corpus_AFD %>%
  tokens(
    remove_punct = TRUE,
    remove_numbers = TRUE,
    remove_symbols = TRUE,
    remove_url = TRUE
  ) %>%
  tokens_remove(pattern = afd_stopwords) %>%
  tokens_keep(pattern = "^[\\p{script=Latn}]+$", valuetype = "regex")
tokens_AFD

### set compound terms
# Multi-word expressions (names, offices, institutions, fixed phrases) that
# are joined into single tokens so they are treated as one feature downstream.
# "Martin Schulz" appears twice in the list; the duplicate is kept as is.
#
# NOTE(review): compounding runs on `tokens_AFD`, i.e. after stopword removal
# and after the Latin-letters-only filter. Entries with a hyphenated word
# (e.g. "Landtag Sachsen-Anhalt") appear unable to match tokens that passed
# that filter, and entries containing a stopword (e.g. "Alternative für
# Deutschland", "Die Linke") presumably cannot match either - confirm whether
# compounding was meant to run before those steps.
compound_phrases <- c(
  "Frau Merkel",
  "Dr Alexander",
  "Deutschen Bundestag",
  "Prof Dr",
  "Bernd Lucke",
  "Dr Alice",
  "Lucke Sprecher",
  "Europäischen Union",
  "Alexander Gauland",
  "Martin Schulz",
  "Europäische Union",
  "Europäischen Parlament",
  "Kanzlerin Merkel",
  "Joachim Starbatty",
  "Europäischen Zentralbank",
  "Paul Hampel",
  "Bundeskanzlerin Angela",
  "Europäischen Parlaments",
  "Angela Merkel",
  "Stephan Protschka",
  "Konrad Adam",
  "Andreas Kalbitz",
  "Wolfgang Schäuble",
  "Deutschen Bundestages",
  "Brandenburger Tor",
  "Mario Draghi",
  "Europäische Parlament",
  "Prof Bernd",
  "Joachim Gauck",
  "Bundesfinanzminister Schäuble",
  "Großen Koalition",
  "Deutsche Bank",
  "Kay Gottschalk",
  "Donald Trump",
  "Europäische Zentralbank",
  "Albrecht Glaser",
  "Bundesrepublik Deutschland",
  "Frank Pasemann",
  "Horst Seehofer",
  "Alice Weidel",
  "Adam Sprecher",
  "Justizminister Maas",
  "Finanzminister Schäuble",
  "Nahen Osten",
  "Sachsen Brandenburg",
  "Helmut Schmidt",
  "Christian Lindner",
  "Volker Kauder",
  "Große Koalition",
  "Christian Schäfer",
  "Bundesfinanzminister Wolfgang",
  "Peter Altmaier",
  "Europäischen Gerichtshof",
  "Manfred Weber",
  "Herr Juncker",
  "Landtag Sachsen-Anhalt",
  "Bundeskanzlerin Merkel",
  "Innenminister Seehofer",
  "Roland Klaus",
  "Jens Spahn",
  "Bundesjustizminister Heiko",
  "Uwe Junge",
  "Europäischen Kommission",
  "Peter Tauber",
  "Präsident Trump",
  "Europäischen Konservativen",
  "Bundesfinanzminister Olaf",
  "Sigmar Gabriel",
  "Hans-Georg Maaßen",
  "Politischen Leitlinien",
  "Ralf Stegner",
  "Berliner Senat",
  "Verteidigungsministerin Ursula",
  "Präsidenten Macron",
  "Frau Kramp-Karrenbauer",
  "Bischof Dröge",
  "David Cameron",
  "Frau Merkels",
  "Junge Freiheit",
  "Frauke Petry",
  "Herr Gabriel",
  "Präsident Erdogan",
  "Bundesfinanzminister Scholz",
  "Präsidenten Erdogan",
  "Dirk Driesang",
  "Mario Draghis",
  "Gunnar Beck",
  "Berliner Zeitung",
  "Boris Palmer",
  "Klaus Wowereit",
  "Herrn Juncker",
  "Bundesjustizminister Maas",
  "Frankfurter Allgemeine",
  "Prof Joachim",
  "Roman Herzog",
  "Uwe Wurlitzer",
  "Jens Weidmann",
  "Joachim Kuhs",
  "Europäische Bankenunion",
  "Marcus Pretzell",
  "Dritten Welt",
  "Verena Brüdigam",
  "Frau Nahles",
  "Frau Roth",
  "Präsident Hollande",
  "Sachsen Thüringen",
  "Staaten Europas",
  "Alexander Dilger",
  "Sozialen Medien",
  "Antonio Stiftung",
  "Herrn Draghi",
  "Herr Präsident",
  "Di Fabio",
  "Emmanuel Macron",
  "Deutschen Instituts",
  "Andrea Nahles",
  "Kalten Krieges",
  "Frank-Walter Steinmeier",
  "Frank Magnitz",
  "Europäischen Stabilitätsmechanismus",
  "Viktor Orbán",
  "Annegret Kramp-Karrenbauer",
  "Dr Gunnar",
  "Deutsche Bundestag",
  "Anne Will",
  "Graf Lambsdorff",
  "Europäischen Verträge",
  "Wochenzeitung Junge",
  "Deutschen Bank",
  "Europäischen Parlamentes",
  "Guido Reil",
  "Christine Lagarde",
  "Matteo Salvini",
  "Deutschen Industrie",
  "Demokratie Rechtsstaatlichkeit",
  "Frau Fahimi",
  "Dr Roland",
  "Dr Frauke",
  "Weimarer Republik",
  "Herr Schäuble",
  "Berliner Abgeordnetenhaus",
  "Herr Seehofer",
  "Deutschen Umwelthilfe",
  "André Poggenburg",
  "Herr Henkel",
  "Global Compact",
  "Die Linke",
  "Gender Mainstreaming",
  "Die Grünen",
  "Hans-Olaf Henkel",
  "Olaf Scholz",
  "Claudia Roth",
  "Boris Johnson",
  "Nicolaus Fest",
  "Amadeu Antonio",
  "Michaela Merz",
  "Peer Steinbrück",
  "Green Deal",
  "Claus Schenk",
  "Kenneth Rogoff",
  "Die Liste",
  "Manuela Schwesig",
  "AfD Sachsen",
  "Wirtschaftsstandort Deutschland",
  "Schenk Graf",
  "Sarah Wagenknecht",
  "Heiko Maas",
  "Frau Beer",
  "Roland Hartwig",
  "Jean-Claude Juncker",
  "Otto Graf",
  "Evangelischen Kirche",
  "Pierre Moscovici",
  "Frankfurter Allgemeinen",
  "Julian Flak",
  "Hans-Werner Sinn",
  "Stephan Brandner",
  "Regierung Merkel",
  "Political Correctness",
  "Frau Petry",
  "Dr Bernd",
  "Europäischen Gerichtshofs",
  "Alternative für Deutschland",
  "Beatrix Storch",
  "Georg Pazderski",
  "Jorg Meuthen",
  "Jörg Meuthen",
  "Martin Schulz"
)

# phrase() marks each entry as a sequence of whitespace-separated tokens.
tokens_AFD_comp <- tokens_compound(tokens_AFD, pattern = phrase(compound_phrases))

# stem corpus
# Apply the German stemmer to the compounded tokens.
tokens_AFD_stem <- tokens_AFD_comp %>%
  tokens_wordstem(language = "german")
tokens_AFD_stem

# create DFM
# Document-feature matrix of the stemmed tokens.
AFD_dfm_tm <- tokens_AFD_stem %>%
  dfm()
print(AFD_dfm_tm)

# trim dfm
# Keep features with a total frequency of at least 10 that occur in at most
# 20% of the documents (max_docfreq is read as a proportion).
AFD_dfm_tm_trim <- AFD_dfm_tm %>%
  dfm_trim(
    min_termfreq = 10,
    max_docfreq = 0.2,
    docfreq_type = "prop"
  )
print(AFD_dfm_tm_trim)

# convert to stm object
# The result is used below through its $documents, $vocab and $meta elements;
# the document variables of the trimmed dfm are carried along as metadata.
stm_prep <- convert(
  AFD_dfm_tm_trim,
  to = "stm",
  docvars = docvars(AFD_dfm_tm_trim)
)

# create model fit plots
# Compare candidate topic counts (3, 6, ..., 60) with searchK and plot the
# resulting diagnostics. The seed is fixed both globally and for the model
# initialisation so that the search can be repeated.
set.seed(90)
candidate_k <- seq(3, 60, by = 3)
k_prediction_one <- searchK(
  documents = stm_prep$documents,
  vocab = stm_prep$vocab,
  K = candidate_k,
  prevalence = ~ sender_clean + Year,
  data = stm_prep$meta,
  init.type = "Spectral",
  seed = 90
)
plot(k_prediction_one)

# fit stm with 20 topics
# Topic prevalence is modelled as a function of sender and year; spectral
# initialisation with a fixed seed.
stm_20 <- stm(
  documents = stm_prep$documents,
  vocab = stm_prep$vocab,
  K = 20,
  prevalence = ~ sender_clean + Year,
  data = stm_prep$meta,
  init.type = "Spectral",
  seed = 90
)

# plot sample words for each topic
# One label plot per group of five topics, eight words per topic.
for (label_topics in list(1:5, 6:10, 11:15, 16:20)) {
  plot.STM(stm_20, type = "labels", n = 8, topics = label_topics)
}

# name topics
# Hand-assigned labels for the 20 topics of `stm_20`. The vector is positional:
# the i-th entry labels topic i, both in the plot below and as the column name
# of the topic proportions built from `stm_20$theta`.
topic_names <- c(
  "Financial_policy",             #  1
  "Foreign_policy",               #  2
  "Eurozone_Greece",              #  3
  "Residual_1",                   #  4
  "Turkey",                       #  5
  "Islamism_antisemitism",        #  6
  "Democracy",                    #  7
  "EU",                           #  8
  "Foreign_policy_2",             #  9
  "France",                       # 10
  "SPD_CDU",                      # 11
  "Intra_party_politics",         # 12
  "Residual_2",                   # 13
  "Defence_policy",               # 14
  "Refugees_borders",             # 15
  "Residual_3",                   # 16
  "EU_Merkel_refugees",           # 17
  "FDP_Greens",                   # 18
  "Societal_policy_integration",  # 19
  "Economic_policy"               # 20
)

# Default plot of the 20-topic model with the custom labels instead of the
# automatically generated ones.
plot.STM(stm_20, custom.labels = topic_names)

# create dataframe of topic proportions with names from above
# One column per topic of `stm_20$theta`, named by `topic_names`, bound
# column-wise to the document metadata.
thetas <- setNames(as.data.frame(stm_20$theta), topic_names)
topics_stm <- cbind(stm_prep$meta, thetas)

# format date variable
# Work on a copy and parse `date` from "day.month.year" text into Date values.
AFD_Data_20_stm_with_collocation <- topics_stm
AFD_Data_20_stm_with_collocation$date <- as.Date(
  AFD_Data_20_stm_with_collocation$date,
  format = "%d.%m.%Y"
)

# average topic proportion over date
# Mean of every topic-proportion column (and of Year) per date.
# The columns are selected through `topic_names`, the same vector that named
# them, rather than through a second hand-typed list. The previous list asked
# for `Residual_4`, which does not exist (the labels define Residual_1 to
# Residual_3), and left out `Residual_2`, so the summary failed.
# all_of() raises an error if any requested column is missing.
Stats <- AFD_Data_20_stm_with_collocation %>%
  group_by(date) %>%
  summarise(across(all_of(c(topic_names, "Year")), mean))

# join topic model data and wiki data
# Keep every row of `Wiki_Total` and attach the per-date topic averages.
AFD_Data_20_stm_13_07_22 <- Wiki_Total %>%
  left_join(Stats, by = "date")

# set data as date type variable
AFD_Data_20_stm_13_07_22$date <- as.Date(AFD_Data_20_stm_13_07_22$date)

# save data
# "PATH" is a placeholder for the output directory; the Stata file is written
# into it under a fixed name.
setwd("PATH")
write_dta(
  AFD_Data_20_stm_13_07_22,
  "AFD_Data_20_stm_13_07_22.dta"
)

# create topic proportion over years plot
# Regress the proportions of topics 17, 15, 1 and 3 on Year and plot the
# estimated relationship as continuous lines.
est_topic <- estimateEffect(c(17, 15, 1, 3) ~ Year, stm_20, stm_prep$meta)

# One colour per plotted topic, in the order 17, 15, 1, 3. The same vector is
# used for the lines and for the legend so the two cannot drift apart.
topic_line_cols <- c("red", "green", "cyan", "purple")
year_ticks <- 2013:2019

# The axis() call used to be passed as an extra argument inside the plot call,
# where it is bound to one of the plot function's own parameters instead of
# being drawn on the finished plot. Suppress the default x axis with
# xaxt = "n" and draw the yearly ticks afterwards.
plot.estimateEffect(est_topic, "Year",
                    method = "continuous",
                    printlegend = FALSE,
                    main = "Topic Change Over Time",
                    linecol = topic_line_cols,
                    xaxt = "n")
axis(1, at = year_ticks, labels = year_ticks, las = 2)
legend("top",
       legend = c("EU, Merkel, Refugees", "Refugees & Borders",
                  "Financial Policy", "Eurozone Greece"),
       col = topic_line_cols, ncol = 1, merge = TRUE, cex = 0.6,
       lty = 1)

# 30 topic STM for appendix
# Same specification as the 20-topic model, with K = 30.
stm_30 <- stm(
  documents = stm_prep$documents,
  vocab = stm_prep$vocab,
  K = 30,
  prevalence = ~ sender_clean + Year,
  data = stm_prep$meta,
  init.type = "Spectral",
  seed = 90
)

# plot sample words for each topic
# One label plot per group of five topics, eight words per topic.
for (label_topics in list(1:5, 6:10, 11:15, 16:20, 21:25, 26:30)) {
  plot.STM(stm_30, type = "labels", n = 8, topics = label_topics)
}